########################################################################################################################################################
# May 01 2020
# Replication File
# Rebecca Cordell, Kristian Skrede Gleditsch, Florian G. Kern, Laura Saavedra-Lux
# Measuring Institutional Variation Across American Indian Constitutions Using Automated Content Analysis
# Journal of Peace Research
# https://journals.sagepub.com/home/jpr  
########################################################################################################################################################

# Clear workspace
rm(list=ls()) 

# Remove scientific notation
options(scipen=999)

# Install packages
#install.packages("ggplot2")
#install.packages("tm")
#install.packages("caret")
#install.packages("mboost")
#install.packages("naive_bayes")

# Required packages
require(ggplot2)
require(tm)
require(caret)
require(mboost)
require(naivebayes)

# Load data 
constitutions<-read.csv("constitutions.csv", header=TRUE)

##############################################################################################################################################
# Figure 1. Constitutions by Year of Adoption
##############################################################################################################################################

# Create graph
hist(constitutions$year,main=NULL,xlab="Year",ylab="Constitution", breaks="FD", col = "grey35")

##############################################################################################################################################
# Figure 2. Constitution length, measured as total word count of a constitution
##############################################################################################################################################

# Calculate word count
# Create document term matrix
const_corpus <- Corpus(VectorSource(constitutions$const.dict.text))
const_dtm <- DocumentTermMatrix(const_corpus, list(wordLengths = c(1, Inf)))
const_words<-rowSums(as.matrix(const_dtm)) # const_words = constitutions$const.words

# Create figure
hist(const_words,main=NULL,xlab="Total number of words",ylab="Tribes", col = "grey35")

##############################################################################################################################################
# Figure 3. Constitution length (measured as average word count), by decade of adoption
##############################################################################################################################################

# Calculate average word count by decade
const_decade<-setNames(aggregate(constitutions$const.words, by=list(Category=constitutions$decade), FUN=mean), c("year", "ave_words"))

# Create figure
ggplot(const_decade, aes(x=year, y=ave_words)) + 
  geom_col(fill= "grey35") + 
  scale_x_discrete(limits = const_decade$year) +
  theme_classic() + xlab("Year") + ylab("Average constitution length")

##############################################################################################################################################
# Figure 4. Length of the section on the judiciary, measured as total word count
##############################################################################################################################################

# Calculate word count
# Create document term matrix
judiciary.sect.corpus <- Corpus(VectorSource(constitutions$judiciary.sect.dict.text))
judiciary.sect.dtm <- DocumentTermMatrix(judiciary.sect.corpus, list(wordLengths = c(1, Inf)))
judiciary_sect_words<-rowSums(as.matrix(judiciary.sect.dtm)) # judiciary_sect_words = constitutions$judiciary.sect.words

# Create figure
hist(judiciary_sect_words,main=NULL,xlab="Total number of words",ylab="Tribes", col = "grey35")

##############################################################################################################################################
# Figure 5. Terms associated with the judiciary included in a constitution, measured as total word count
##############################################################################################################################################

# Calculate word count of dictionary terms
# Create document term matrix
judiciary_dict <- c("adjud","adjudg","bylaw","court","judg","judici","judiciari","jurisdict","justic","law","lawandord","prejudici","quasijudici","unlaw")
judiciary_dict_dtm <- DocumentTermMatrix(const_corpus, list(wordLengths = c(1, Inf), dictionary=judiciary_dict))
judiciary_dict_words<-rowSums(as.matrix(judiciary_dict_dtm)) # judiciary_dict_words = constitutions$judiciary.dict.words

# Create figure
hist(judiciary_dict_words,main=NULL,xlab="Total number of words",ylab="Tribes", col = "grey35")

##############################################################################################################################################
# Figure 6. Terms associated with an independent judiciary included in the judicial section of a constitution, measured as total word count
##############################################################################################################################################

# Calculate word count of dictionary terms
# Create document term matrix
independ_dict <- c("appoint","appointe","author","compens","decid","decis","disqualif","disqualifi","elect","elector","elig","independ","inelig","power","qualif","qualifi","reappoint","remov","salari","select","serv","term","vote","voter")
independ_dict_dtm <- DocumentTermMatrix(judiciary.sect.corpus, list(wordLengths = c(1, Inf), dictionary=independ_dict))
independ_dict_words<-rowSums(as.matrix(independ_dict_dtm)) # independ_dict_words = constitutions$independ.dict.words

# Create figure
hist(independ_dict_words,main=NULL,xlab="Total number of words",ylab="Tribes", col = "grey35")

##############################################################################################################################################
# Table I. dict terms included in the judiciary dictionary
##############################################################################################################################################

# List judiciary dictionary terms
judiciary_dict

##############################################################################################################################################
# Table II. dict terms included in the judicial independence dictionary
##############################################################################################################################################

# List independent dictionary terms
independ_dict

##############################################################################################################################################
# Table III. Percentage agreement, precision and recall for the JUDICIARY and INDEPEND replication variable using a dictionary approach
##############################################################################################################################################

# Subset constitutions in our sample that overlap with Cornell & Kalt’s (2000) data
const_overlap<-constitutions[constitutions$const.overlap.ck==1,]

# Create JUDICIARY replication variable using a dictionary approach
# Create document term matrix
const_overlap_corpus <- Corpus(VectorSource(const_overlap$const.dict.text))
judiciary_repl_dict <- c("adjud","adjudg","court","judg","judici","judiciari","prejudici","quasijudici")
judiciary_repl_dict_dtm <- DocumentTermMatrix(const_overlap_corpus, list(wordLengths = c(1, Inf), dictionary=judiciary_repl_dict))
judiciary_repl_dict<-rowSums(as.matrix(judiciary_repl_dict_dtm)) 
judiciary_repl_dict <- as.vector(ifelse(judiciary_repl_dict>=1, 1, 0)) # judiciary_repl_dict = constitutions$judiciary.repl.dict

# Calculate the agreement, precision and recall for the JUDICIARY variable
# Create confusion matrix
confusion_matrix_judiciary_repl_dict<-table(judiciary_repl_dict,const_overlap$judiciary.ck)
zero <- matrix(c(0,0),ncol=2,byrow=TRUE)
colnames(zero) <- c("0","1")
rownames(zero) <- c("0")
zero <- as.table(zero)
confusion_matrix_judiciary_repl_dict<-rbind(zero,confusion_matrix_judiciary_repl_dict) 

## Calculate true positive and true negative rate
TP_judiciary_repl_dict <- confusion_matrix_judiciary_repl_dict[2,2]
TN_judiciary_repl_dict <- confusion_matrix_judiciary_repl_dict[1,1]
## Calculate false positive and true negative rate
FP_judiciary_repl_dict <- confusion_matrix_judiciary_repl_dict[1,2]
FN_judiciary_repl_dict <- confusion_matrix_judiciary_repl_dict[2,1]
## Calculate accuracy
accuracy_judiciary_repl_dict <- (TP_judiciary_repl_dict + TN_judiciary_repl_dict) / (TP_judiciary_repl_dict + TN_judiciary_repl_dict + FP_judiciary_repl_dict + FN_judiciary_repl_dict)
## Calculate false negative rate
false_negative_rate_judiciary_repl_dict <- FN_judiciary_repl_dict / (TP_judiciary_repl_dict + TN_judiciary_repl_dict + FP_judiciary_repl_dict + FN_judiciary_repl_dict)
## Calculate false positive rate
false_positive_rate_judiciary_repl_dict <- FP_judiciary_repl_dict / (TP_judiciary_repl_dict + TN_judiciary_repl_dict + FP_judiciary_repl_dict + FN_judiciary_repl_dict)
## Calculate precision
precision_judiciary_repl_dict <- (TP_judiciary_repl_dict) / (TP_judiciary_repl_dict + FP_judiciary_repl_dict)
## Calculate recall
recall_judiciary_repl_dict <- (TP_judiciary_repl_dict) / (TP_judiciary_repl_dict + FN_judiciary_repl_dict)

# Create INDEPEND replication variable using a dictionary approach
# Create document term matrix
judiciary_sect_overlap_corpus <- Corpus(VectorSource(const_overlap$judiciary.sect.dict.text))
independ_repl_dict <- c("independ")
independ_repl_dict_dtm <- DocumentTermMatrix(judiciary_sect_overlap_corpus, list(wordLengths = c(1, Inf), dictionary=independ_repl_dict))
independ_repl_dict<-rowSums(as.matrix(independ_repl_dict_dtm)) 
independ_repl_dict <- as.vector(ifelse(independ_repl_dict>=1, 1, 0)) # independ_repl_dict = constitutions$independ.repl.dict

# Calculate the agreement, precision and recall for the INDEPEND variable
# Create confusion matrix
confusion_matrix_independ_dict<-table(independ_repl_dict,const_overlap$independ.ck)

## Calculate true positive and true negative rate
TP_independ_dict <- confusion_matrix_independ_dict[2,2]
TN_independ_dict <- confusion_matrix_independ_dict[1,1]
## Calculate false positive and true negative rate
FP_independ_dict <- confusion_matrix_independ_dict[1,2]
FN_independ_dict <- confusion_matrix_independ_dict[2,1]
## Calculate accuracy
accuracy_independ_dict <- (TP_independ_dict + TN_independ_dict) / (TP_independ_dict + TN_independ_dict + FP_independ_dict + FN_independ_dict)
## Calculate false negative rate
false_negative_rate_independ_dict <- FN_independ_dict / (TP_independ_dict + TN_independ_dict + FP_independ_dict + FN_independ_dict)
## Calculate false positive rate
false_positive_rate_independ_dict <- FP_independ_dict / (TP_independ_dict + TN_independ_dict + FP_independ_dict + FN_independ_dict)
## Calculate precision
precision_independ_dict <- (TP_independ_dict) / (TP_independ_dict + FP_independ_dict)
## Calculate recall
recall_independ_dict <- (TP_independ_dict) / (TP_independ_dict + FN_independ_dict)

# Create table
accuracy_judiciary_repl_dict<-accuracy_judiciary_repl_dict*100
accuracy_independ_dict<-accuracy_independ_dict*100
judiciary_repl_dict_res<-cbind(accuracy_judiciary_repl_dict,precision_judiciary_repl_dict,recall_judiciary_repl_dict)
independ_dict_res<-cbind(accuracy_independ_dict,precision_independ_dict,recall_independ_dict)
table_III<-as.data.frame(rbind(judiciary_repl_dict_res,independ_dict_res))
colnames(table_III) <- c("Percentage Agreement", "Precision", "Recall")
rownames(table_III) <- c("JUDICIARY", "INDEPEND")
round(table_III, 3)

##############################################################################################################################################
# Table IV. Model accuracy, precision and recall for the JUDICIARY replication variable using supervised machine learning
##############################################################################################################################################

# Create JUDICIARY replication variable using supervised machine learning
# Create document term matrix
const_overlap_corpus <- Corpus(VectorSource(const_overlap$const.sml.text))
judiciary_repl_sml_dtm <- DocumentTermMatrix(const_overlap_corpus)
judiciary_repl_sml_dtm<-as.data.frame(as.matrix(judiciary_repl_sml_dtm))

# Split data into training and test data
set.seed(3456)
judiciary.ck<-as.factor(const_overlap$judiciary.ck)
judiciary_repl_sml_dtm<-cbind(judiciary_repl_sml_dtm,judiciary.ck)
judiciary_repl_sml_inTraining <- createDataPartition(judiciary_repl_sml_dtm$judiciary.ck, p = .7, list = FALSE, times=1)
judiciary_repl_sml_train <- judiciary_repl_sml_dtm[ judiciary_repl_sml_inTraining,]
judiciary_repl_sml_test  <- judiciary_repl_sml_dtm[-judiciary_repl_sml_inTraining,]

# Tune parameters of sml models 
fitControl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 5)

# Train logistic regression model
set.seed(825)
judiciary_repl_sml_logistic <- train(judiciary.ck ~., data = judiciary_repl_sml_train, 
                                       method = "glmboost", 
                                       trControl = fitControl)

# Train naive bayes model
set.seed(825)
judiciary_repl_sml_nb <- train(judiciary.ck ~., data = judiciary_repl_sml_train, 
                                 method = "naive_bayes", 
                                 trControl = fitControl)

# Train support vector machine model
set.seed(825)
judiciary_repl_sml_svm <- train(judiciary.ck ~., data = judiciary_repl_sml_train, 
                                  method = "svmPoly", 
                                  trControl = fitControl, scale = FALSE)

# Predict values of test data
# Predict using logistic regression model
judiciary_repl_sml_test_pred_logistic<-predict(judiciary_repl_sml_logistic, newdata = judiciary_repl_sml_test) # judiciary_repl_sml_test_pred_logistic = constitutions$judiciary.repl.sml.logistic

# Predict using naive bayes model
judiciary_repl_sml_test_pred_nb<-predict(judiciary_repl_sml_nb, newdata = judiciary_repl_sml_test) # judiciary_repl_sml_test_pred_logistic = constitutions$judiciary.repl.sml.nb

# Predict using support vector machine model
judiciary_repl_sml_test_pred_svm<-predict(judiciary_repl_sml_svm, newdata = judiciary_repl_sml_test) # judiciary_repl_sml_test_pred_logistic = constitutions$judiciary.repl.sml.svm

# Calculate the agreement, precision and recall for the JUDICIARY variable, using the logistic regression model
# Create confusion matrix
confusion_matrix_judiciary_logistic<-table(judiciary_repl_sml_test_pred_logistic,judiciary_repl_sml_test$judiciary.ck)
## Calculate true positive and true negative rate
TP_judiciary_logistic <- confusion_matrix_judiciary_logistic[2,2]
TN_judiciary_logistic <- confusion_matrix_judiciary_logistic[1,1]
## Calculate false positive and true negative rate
FP_judiciary_logistic <- confusion_matrix_judiciary_logistic[1,2]
FN_judiciary_logistic <- confusion_matrix_judiciary_logistic[2,1]
## Calculate accuracy
accuracy_judiciary_logistic <- (TP_judiciary_logistic + TN_judiciary_logistic) / (TP_judiciary_logistic + TN_judiciary_logistic + FP_judiciary_logistic + FN_judiciary_logistic)
## Calculate false negative rate
false_negative_rate_judiciary_logistic <- FN_judiciary_logistic / (TP_judiciary_logistic + TN_judiciary_logistic + FP_judiciary_logistic + FN_judiciary_logistic)
## Calculate false positive rate
false_positive_rate_judiciary_logistic <- FP_judiciary_logistic / (TP_judiciary_logistic + TN_judiciary_logistic + FP_judiciary_logistic + FN_judiciary_logistic)
## Calculate precision
precision_judiciary_logistic <- (TP_judiciary_logistic) / (TP_judiciary_logistic + FP_judiciary_logistic)
## Calculate recall
recall_judiciary_logistic <- (TP_judiciary_logistic) / (TP_judiciary_logistic + FN_judiciary_logistic)

# Calculate the agreement, precision and recall for the JUDICIARY variable, using the naive bayes model
# Create Confusion Matrix
confusion_matrix_judiciary_nb<-table(judiciary_repl_sml_test_pred_nb,judiciary_repl_sml_test$judiciary.ck)

## Calculate true positive and true negative rate
TP_judiciary_nb <- confusion_matrix_judiciary_nb[2,2]
TN_judiciary_nb <- confusion_matrix_judiciary_nb[1,1]
## Calculate false positive and true negative rate
FP_judiciary_nb <- confusion_matrix_judiciary_nb[1,2]
FN_judiciary_nb <- confusion_matrix_judiciary_nb[2,1]
## Calculate accuracy
accuracy_judiciary_nb <- (TP_judiciary_nb + TN_judiciary_nb) / (TP_judiciary_nb + TN_judiciary_nb + FP_judiciary_nb + FN_judiciary_nb)
## Calculate false negative rate
false_negative_rate_judiciary_nb <- FN_judiciary_nb / (TP_judiciary_nb + TN_judiciary_nb + FP_judiciary_nb + FN_judiciary_nb)
## Calculate false positive rate
false_positive_rate_judiciary_nb <- FP_judiciary_nb / (TP_judiciary_nb + TN_judiciary_nb + FP_judiciary_nb + FN_judiciary_nb)
## Calculate precision
precision_judiciary_nb <- (TP_judiciary_nb) / (TP_judiciary_nb + FP_judiciary_nb)
## Calculate recall
recall_judiciary_nb <- (TP_judiciary_nb) / (TP_judiciary_nb + FN_judiciary_nb)

# Calculate the agreement, precision and recall for the JUDICIARY variable, using the support vector machine model
# Create Confusion Matrix
confusion_matrix_judiciary_svm<-table(judiciary_repl_sml_test_pred_svm,judiciary_repl_sml_test$judiciary.ck)
## Calculate true positive and true negative rate
TP_judiciary_svm <- confusion_matrix_judiciary_svm[2,2]
TN_judiciary_svm <- confusion_matrix_judiciary_svm[1,1]
## Calculate false positive and true negative rate
FP_judiciary_svm <- confusion_matrix_judiciary_svm[1,2]
FN_judiciary_svm <- confusion_matrix_judiciary_svm[2,1]
## Calculate accuracy
accuracy_judiciary_svm <- (TP_judiciary_svm + TN_judiciary_svm) / (TP_judiciary_svm + TN_judiciary_svm + FP_judiciary_svm + FN_judiciary_svm)
## Calculate false negative rate
false_negative_rate_judiciary_svm <- FN_judiciary_svm / (TP_judiciary_svm + TN_judiciary_svm + FP_judiciary_svm + FN_judiciary_svm)
## Calculate false positive rate
false_positive_rate_judiciary_svm <- FP_judiciary_svm / (TP_judiciary_svm + TN_judiciary_svm + FP_judiciary_svm + FN_judiciary_svm)
## Calculate precision
precision_judiciary_svm <- (TP_judiciary_svm) / (TP_judiciary_svm + FP_judiciary_svm)
## Calculate recall
recall_judiciary_svm <- (TP_judiciary_svm) / (TP_judiciary_svm + FN_judiciary_svm)

# Create table
judiciary_logistic<-cbind(accuracy_judiciary_logistic,precision_judiciary_logistic,recall_judiciary_logistic)
judiciary_nb<-cbind(accuracy_judiciary_nb,precision_judiciary_nb,recall_judiciary_nb)
judiciary_svm<-cbind(accuracy_judiciary_svm,precision_judiciary_svm,recall_judiciary_svm)
table_IV<-as.data.frame(rbind(judiciary_logistic,judiciary_nb,judiciary_svm))
colnames(table_IV) <- c("Accuracy", "Precision", "Recall")
rownames(table_IV) <- c("Logistic Regression", "Naive Bayes", "Support Vector Machine")
round(table_IV, 3)

##############################################################################################################################################
# Table V. Model accuracy, precision and recall for the INDEPEND replication variable using supervised machine learning
##############################################################################################################################################

# Create INDEPEND replication variable using supervised machine learning
# Create document term matrix
const_overlap_judiciary_sect_corpus <- Corpus(VectorSource(const_overlap$judiciary.sect.sml.text))
independ_repl_sml_dtm <- DocumentTermMatrix(const_overlap_judiciary_sect_corpus)
independ_repl_sml_dtm<-as.data.frame(as.matrix(independ_repl_sml_dtm))

# Split data into training and test data
set.seed(3456)
independ.ck<-as.factor(const_overlap$independ.ck)
independ_repl_sml_dtm<-cbind(independ_repl_sml_dtm,independ.ck)
independ_repl_sml_inTraining <- createDataPartition(independ_repl_sml_dtm$independ.ck, p = .7, list = FALSE, times=1)
independ_repl_sml_train <- independ_repl_sml_dtm[ independ_repl_sml_inTraining,]
independ_repl_sml_test  <- independ_repl_sml_dtm[-independ_repl_sml_inTraining,]

# Train logistic regression model
set.seed(825)
independ_repl_sml_logistic <- train(independ.ck ~., data = independ_repl_sml_train, 
                                     method = "glmboost", 
                                     trControl = fitControl)
# Warning message:
# In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,  :
# There were missing values in resampled performance measures.

# Train naive bayes model
set.seed(825)
independ_repl_sml_nb <- train(independ.ck ~., data = independ_repl_sml_train, 
                               method = "naive_bayes", 
                               trControl = fitControl)
# Warning message:
# In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,  :
# There were missing values in resampled performance measures.

# Train support vector machine model
set.seed(825)
independ_repl_sml_svm <- train(independ.ck ~., data = independ_repl_sml_train, 
                                method = "svmPoly", 
                                trControl = fitControl, scale=FALSE)
# Warning message:
# In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,  :
  # There were missing values in resampled performance measures.

# Predict values of test data
# Predict using logistic regression model
independ_repl_sml_test_pred_logistic<-predict(independ_repl_sml_logistic, newdata = independ_repl_sml_test) # independ_repl_sml_test_pred_logistic = constitutions$independ.repl.sml.logistic

# Predict using naive bayes model
independ_repl_sml_test_pred_nb<-predict(independ_repl_sml_nb, newdata = independ_repl_sml_test) # independ_repl_sml_test_pred_nb = constitutions$independ.repl.sml.nb

# Predict using support vector machine model
independ_repl_sml_test_pred_svm<-predict(independ_repl_sml_svm, newdata = independ_repl_sml_test) # independ_repl_sml_test_pred_svm = constitutions$independ.repl.sml.svm

# Calculate the agreement, precision and recall for the independ variable, using the logistic regression model
# Create confusion matrix
confusion_matrix_independ_logistic<-table(independ_repl_sml_test_pred_logistic,independ_repl_sml_test$independ.ck)
## Calculate true positive and true negative rate
TP_independ_logistic <- confusion_matrix_independ_logistic[2,2]
TN_independ_logistic <- confusion_matrix_independ_logistic[1,1]
## Calculate false positive and true negative rate
FP_independ_logistic <- confusion_matrix_independ_logistic[1,2]
FN_independ_logistic <- confusion_matrix_independ_logistic[2,1]
## Calculate accuracy
accuracy_independ_logistic <- (TP_independ_logistic + TN_independ_logistic) / (TP_independ_logistic + TN_independ_logistic + FP_independ_logistic + FN_independ_logistic)
## Calculate false negative rate
false_negative_rate_independ_logistic <- FN_independ_logistic / (TP_independ_logistic + TN_independ_logistic + FP_independ_logistic + FN_independ_logistic)
## Calculate false positive rate
false_positive_rate_independ_logistic <- FP_independ_logistic / (TP_independ_logistic + TN_independ_logistic + FP_independ_logistic + FN_independ_logistic)
## Calculate precision
precision_independ_logistic <- (TP_independ_logistic) / (TP_independ_logistic + FP_independ_logistic)
## Calculate recall
recall_independ_logistic <- (TP_independ_logistic) / (TP_independ_logistic + FN_independ_logistic)

# Calculate the agreement, precision and recall for the independ variable, using the naive bayes model
# Create Confusion Matrix
confusion_matrix_independ_nb<-table(independ_repl_sml_test_pred_nb,independ_repl_sml_test$independ.ck)

## Calculate true positive and true negative rate
TP_independ_nb <- confusion_matrix_independ_nb[2,2]
TN_independ_nb <- confusion_matrix_independ_nb[1,1]
## Calculate false positive and true negative rate
FP_independ_nb <- confusion_matrix_independ_nb[1,2]
FN_independ_nb <- confusion_matrix_independ_nb[2,1]
## Calculate accuracy
accuracy_independ_nb <- (TP_independ_nb + TN_independ_nb) / (TP_independ_nb + TN_independ_nb + FP_independ_nb + FN_independ_nb)
## Calculate false negative rate
false_negative_rate_independ_nb <- FN_independ_nb / (TP_independ_nb + TN_independ_nb + FP_independ_nb + FN_independ_nb)
## Calculate false positive rate
false_positive_rate_independ_nb <- FP_independ_nb / (TP_independ_nb + TN_independ_nb + FP_independ_nb + FN_independ_nb)
## Calculate precision
precision_independ_nb <- (TP_independ_nb) / (TP_independ_nb + FP_independ_nb)
## Calculate recall
recall_independ_nb <- (TP_independ_nb) / (TP_independ_nb + FN_independ_nb)

# Calculate the agreement, precision and recall for the independ variable, using the support vector machine model
# Create Confusion Matrix
confusion_matrix_independ_svm<-table(independ_repl_sml_test_pred_svm,independ_repl_sml_test$independ.ck)
## Calculate true positive and true negative rate
TP_independ_svm <- confusion_matrix_independ_svm[2,2]
TN_independ_svm <- confusion_matrix_independ_svm[1,1]
## Calculate false positive and true negative rate
FP_independ_svm <- confusion_matrix_independ_svm[1,2]
FN_independ_svm <- confusion_matrix_independ_svm[2,1]
## Calculate accuracy
accuracy_independ_svm <- (TP_independ_svm + TN_independ_svm) / (TP_independ_svm + TN_independ_svm + FP_independ_svm + FN_independ_svm)
## Calculate false negative rate
false_negative_rate_independ_svm <- FN_independ_svm / (TP_independ_svm + TN_independ_svm + FP_independ_svm + FN_independ_svm)
## Calculate false positive rate
false_positive_rate_independ_svm <- FP_independ_svm / (TP_independ_svm + TN_independ_svm + FP_independ_svm + FN_independ_svm)
## Calculate precision
precision_independ_svm <- (TP_independ_svm) / (TP_independ_svm + FP_independ_svm)
## Calculate recall
recall_independ_svm <- (TP_independ_svm) / (TP_independ_svm + FN_independ_svm)

# Create table
independ_logistic<-cbind(accuracy_independ_logistic,precision_independ_logistic,recall_independ_logistic)
independ_nb<-cbind(accuracy_independ_nb,precision_independ_nb,recall_independ_nb)
independ_svm<-cbind(accuracy_independ_svm,precision_independ_svm,recall_independ_svm)
table_V<-as.data.frame(rbind(independ_logistic,independ_nb,independ_svm))
colnames(table_V) <- c("Accuracy", "Precision", "Recall")
rownames(table_V) <- c("Logistic Regression", "Naive Bayes", "Support Vector Machine")
round(table_V, 3)

##############################################################################################################################################
# Table A1: Excerpts from three American Indian constitutions regarding the structure of judicial institutions
##############################################################################################################################################
# Created by hand

##############################################################################################################################################
# Table A2: Correlation matrix for the dictionary measures, using a Pearson correlation coefficient
##############################################################################################################################################
dict.measures<-as.data.frame(cbind(constitutions$const.words, constitutions$judiciary.sect.words, constitutions$judiciary.dict.words, constitutions$independ.dict.words))
table_A2<-cor(dict.measures, method = c("pearson"))
colnames(table_A2) <- c("Constitution Length", "Judicial Section Length", "Judiciary Dictionary", "Judicial Independence Dictionary")
rownames(table_A2) <- c("Constitution Length", "Judicial Section Length", "Judiciary Dictionary", "Judicial Independence Dictionary")
round(table_A2, 3)

##############################################################################################################################################
# Descriptive statistics in main text
##############################################################################################################################################
# The oldest active constitution, approved in 1935, is the Santa Clara Pueblo, New Mexico, and the most recent constitution, approved in 2010, is the Absentee-Shawnee Tribe of Oklahoma, who we record as amending their original constitution twice. 
min(constitutions$year)
const_1935<-constitutions[constitutions$year=="1935",]
const_1935$tribe
max(constitutions$year)
const_2010<-constitutions[constitutions$year=="2010",]
const_2010$tribe[1]
# Number of amendments calculated by hand

# Most constitutions in our sample were adopted during the 1980s through to the 2000s (67 constitutions, 69%) – with the greatest number adopted in the 2000s (31 constitutions, 32%), and the lowest number adopted in the 1950s and 1910 (3 constitutions each, 3%).
table(constitutions$decade)
12+24+31
round(67/97*100, 0)
round(31/97*100, 0)
round(3/97*100, 0)

# For example, the St. Croix Chippewa Indians of Wisconsin has the shortest constitution in our sample (approved in 1942) with only 1,423 words, seven articles and three bylaws. 
min(constitutions$const.words)
const_1423<-constitutions[constitutions$const.words=="1423",]
const_1423$tribe
const_1423$year
# Number of articles and bylaws calculated by hand

# By contrast, the 1992 Menominee Indian Tribe of Wisconsin constitution is nearly thirteen times longer, with 18,406 words based on 20 articles and six bylaws. 
max(constitutions$const.words)
const_18406<-constitutions[constitutions$const.words=="18406",]
const_18406$tribe
const_18406$year
round(18406/1423,0)

# Number of articles and bylaws calculated by hand

# The average length of a constitution in our sample is 5,410 words.
round(mean(constitutions$const.words),0)

# On average, constitutions adopted from 1980-2010 were 28% longer than constitutions adopted during the earlier years in our sample.
const_1980s_2010<-constitutions[constitutions$year>="1980" & constitutions$year<="2010",]
const_1930s_1970s<-constitutions[constitutions$year>="1930" & constitutions$year<="1979",]
mean(const_1980s_2010$const.words)
mean(const_1930s_1970$const.words)
round((5864.686-4229.778)/5864.686*100,0)

# In general, we observe a steady increase in average length over time – with constitutions produced in 2010 on average 20% longer than constitutions produced during the 1930s. 
const_2010<-constitutions[constitutions$decade=="2010",]
const_1930s<-constitutions[constitutions$decade=="1930s",]
mean(const_2010$const.words)
mean(const_1930s$const.words)
round((6030-4838.857)/6030*100,0)

# However, this growth is not entirely consistent as constitutions adopted during the 1930s in our sample were on average 29-40% longer than those adopted during the 1940s and 1950s. 
const_1940s<-constitutions[constitutions$decade=="1940s",]
const_1950s<-constitutions[constitutions$decade=="1950s",]
mean(const_1950s$const.words)
mean(const_1930s$const.words)
round((4838.857-3426)/4838.857*100,0)
mean(const_1930s$const.words)
mean(const_1940s$const.words)
round((4838.857-2919.2)/4838.857*100,0)

# Using the Wilcoxon signed-rank test, we find a statistically significant increase in the average word count from constitutions produced over 1935-1969 to those produced during 1976-2010 at the 99% confidence level. 
const_1930s_1960s<-constitutions[constitutions$year>="1930" & constitutions$year<="1969",]
const_1970s_2010<-constitutions[constitutions$year>="1970" & constitutions$year<="2010",]
wilcox.test(const_1930s_1960s$const.words, const_1970s_2010$const.words, paired=FALSE) 

# However, the change in average word count is only statistically significant at the 99% confidence level for constitutions produced during the 1930s and 1940s.
const_1960s<-constitutions[constitutions$decade=="1960s",]
const_1970s<-constitutions[constitutions$decade=="1970s",]
const_1980s<-constitutions[constitutions$decade=="1980s",]
const_1990s<-constitutions[constitutions$decade=="1990s",]
const_2000s<-constitutions[constitutions$decade=="2000s",]
wilcox.test(const_1930s$const.words, const_1940s$const.words, paired=FALSE) 
wilcox.test(const_1940s$const.words, const_1950s$const.words, paired=FALSE) 
wilcox.test(const_1950s$const.words, const_1960s$const.words, paired=FALSE) 
wilcox.test(const_1960s$const.words, const_1970s$const.words, paired=FALSE) 
wilcox.test(const_1970s$const.words, const_1980s$const.words, paired=FALSE) 
wilcox.test(const_1980s$const.words, const_1990s$const.words, paired=FALSE) 
wilcox.test(const_1990s$const.words, const_2000s$const.words, paired=FALSE) 
wilcox.test(const_2000s$const.words, const_2010$const.words, paired=FALSE) 

# For example, 52% of constitutions do not have a dedicated section on the judiciary. 
summary(constitutions$judiciary.sect.words==0)
round(50/97*100,0)

# The longest section on the judiciary in our sample is the Turtle Mountain Band of Chippewa Indians of North Dakota’s constitution (approved in 2006) which consists of 1,766 words – containing information on the purpose of the judiciary, the establishment, judicial powers, the selection of judges, the judicial board, implementation and saving clause and reservation powers of the tribal council. 
max(constitutions$judiciary.sect.words)
const_1766<-constitutions[constitutions$judiciary.sect.words=="1766",]
const_1766$tribe
const_1766$year

# The tribe with the shortest judicial section in their constitution (excluding tribes who do not have one) is the Lower Sioux Indian Community in Minnesota (approved in 2007) at 46 words. 
judiciary_sect<-constitutions[constitutions$judiciary.sect.words!=0,]
min(judiciary_sect$judiciary.sect.words)
const_46<-constitutions[constitutions$judiciary.sect.words=="46",]
const_46$tribe
const_46$year

# Figure 4 displays the distribution of judicial section length (i.e. word count) in our sample across tribes, with a mean of 231 words.
round(mean(constitutions$judiciary.sect.words),0)

# For example, for the St. Croix Chippewa Indians of Wisconsin, the terms included in our judiciary dictionary appear only 8 times in the constitution (approved in 1942). 
min(constitutions$judiciary.dict.words)
const_8<-constitutions[constitutions$judiciary.dict.words=="8",]
const_8$tribe
const_8$year

# The Menominee of Wisconsin unit has the greatest amount of terms associated with the judiciary in the constitution (approved in 1992), with terms in dictionary appearing 286 times. 
max(constitutions$judiciary.dict.words)
const_286<-constitutions[constitutions$judiciary.dict.words=="286",]
const_286$tribe
const_286$year

# Figure 5 displays the distribution of the number of terms included in our judicial dictionary appearing in a constitution with a mean value of 57.
round(mean(constitutions$judiciary.dict.words),0)

# Since 52% of the constitutions in our sample do not have a dedicated section in their constitution on the judiciary, this group of tribes score a 0 on this measure. 
summary(constitutions$independ.dict.words==0)
round(50/97*100,0)

# The Turtle Mountain Band of Chippewas of North Dakota includes the greatest number of references to an independent judiciary (with terms in our judicial independence dictionary appearing in the judicial section 93 times). 
max(constitutions$independ.dict.words)
const_93<-constitutions[constitutions$independ.dict.words=="93",]
const_93$tribe

# Their constitution approved in 2006 mentions the term ‘independ’ 3 times, ‘elect’ 24 times and ‘term’ 11 times. 
const_93$year
const_93_corpus <- Corpus(VectorSource(const_93$judiciary.sect.dict.text))
independ_elect_term <- c("independ","elect","term")
const_93_dtm <- DocumentTermMatrix(const_93_corpus, list(wordLengths = c(1, Inf), dictionary=independ_elect_term))
const_93_dtm<-as.matrix(const_93_dtm)
const_93_dtm

# In comparison, the tribes with the least amount of terms included in our judicial independence dictionary appearing in a judicial section (excluding tribes which do not have one) is the Lower Sioux Indian Community in Minnesota and the Tule River of the Tule River Reservation in California. 
min(judiciary_sect$independ.dict.words)
const_1<-judiciary_sect[judiciary_sect$independ.dict.words=="1",]
const_1$tribe 

# The only term included in our dictionary that appears in their constitution approved in 2007 and 1974 is ‘author’ just once. 
const_1$year
const_1_corpus <- Corpus(VectorSource(const_1$judiciary.sect.dict.text))
independ_dict <- c("appoint","appointe","author","compens","decid","decis","disqualif","disqualifi","elect","elector","elig","independ","inelig","power","qualif","qualifi","reappoint","remov","salari","select","serv","term","vote","voter")
const_1_dtm <- DocumentTermMatrix(const_1_corpus, list(wordLengths = c(1, Inf), dictionary=independ_dict))
const_1_dtm<-as.matrix(const_1_dtm)
const_1_dtm

# Figure 6 displays the distribution of the number of terms included in our independent judicial dictionary appearing in the judicial section of a constitution, with a mean value of 9.
round(mean(constitutions$independ.dict.words),0)

# Unpacking the constitutions qualitatively helps illustrate the validity of our measure. If we compare constitutions which are in the 80th percentile of our judicial independence measure with those at the 60th percentile, a clear difference becomes apparent in how detailed judiciary independence is specified. The constitutions in the 60th percentile contain topics in their judiciary section such as definition of the establishment of judicial institutions, selection procedure and number of judges.  Constitutions in the 80th percentile contain all of those topics, but also include topics such as salary insulation, limitations to removal procedures, and the power for judges to review and hold to account the actions of the governing council.
# Calculated by hand

# This is the case for all 38 constitutions in our sample that overlap with the Cornell & Kalt (2000) data.
table(judiciary_repl_dict)

# This is the case for only 5% of constitutions in our sample (2), with the remaining 95% not including the term ‘independent’ in the judicial section of the constitution (36). 
table(independ_repl_dict)
round(2/nrow(const_overlap)*100,0)

# For example, when we search for all terms included in our judicial independence dictionary, the precision and recall improve (correctly identifying 80% of the true positives identified by Cornell & Kalt 2000) but the accuracy decreases as our models assign the value of 1 to constitutions that Cornell & Kalt (2000) coded 0.
independ_repl_dict_all_dtm <- DocumentTermMatrix(judiciary_sect_overlap_corpus, list(wordLengths = c(1, Inf), dictionary=independ_dict))
independ_repl_dict_all<-rowSums(as.matrix(independ_repl_dict_all_dtm)) 
independ_repl_dict_all <- as.vector(ifelse(independ_repl_dict_all>=1, 1, 0))
confusion_matrix_independ_repl_dict_all<-table(independ_repl_dict_all,const_overlap$independ.ck)
TP_independ_repl_dict_all <- confusion_matrix_independ_repl_dict_all[2,2]
TN_independ_repl_dict_all <- confusion_matrix_independ_repl_dict_all[1,1]
FP_independ_repl_dict_all <- confusion_matrix_independ_repl_dict_all[1,2]
FN_independ_repl_dict_all <- confusion_matrix_independ_repl_dict_all[2,1]
accuracy_independ_repl_dict_all <- (TP_independ_repl_dict_all + TN_independ_repl_dict_all) / (TP_independ_repl_dict_all + TN_independ_repl_dict_all + FP_independ_repl_dict_all + FN_independ_repl_dict_all)
false_negative_rate_independ_repl_dict_all <- FN_independ_repl_dict_all / (TP_independ_repl_dict_all + TN_independ_repl_dict_all + FP_independ_repl_dict_all + FN_independ_repl_dict_all)
false_positive_rate_independ_repl_dict_all <- FP_independ_repl_dict_all / (TP_independ_repl_dict_all + TN_independ_repl_dict_all + FP_independ_repl_dict_all + FN_independ_repl_dict_all)
precision_independ_repl_dict_all <- (TP_independ_repl_dict_all) / (TP_independ_repl_dict_all + FP_independ_repl_dict_all)
recall_independ_repl_dict_all <- (TP_independ_repl_dict_all) / (TP_independ_repl_dict_all + FN_independ_repl_dict_all)
round(precision_independ_repl_dict_all*100, 3)
round(recall_independ_repl_dict_all*100, 3)

# For the supervised machine learning method, these results are likely due to the small number of constitutions assigned a 1 by Cornell & Kalt (2000) that the model is trained on (only 5 constitutions) and similarities between those constitutions in the training data assigned a 1 and 0.
table(const_overlap$independ.ck==1)

# Footnote 7: The weakest (positive) correlation is between constitution length and independent judicial dictionary measure at 0.48. The constitution length and judicial section length are also weakly (positively) correlated at 0.52.
round(table_A2,3)

# Footnote 10: However, we obtain the same level of agreement when we account for all terms in our judiciary dictionary as constitutions that mention the structure of the judiciary tend to include more general terms related to the judiciary.
judiciary_repl_dict_all_dtm <- DocumentTermMatrix(const_overlap_corpus, list(wordLengths = c(1, Inf), dictionary=judiciary_dict))
judiciary_repl_dict_all<-rowSums(as.matrix(judiciary_repl_dict_all_dtm)) 
judiciary_repl_dict_all <- as.vector(ifelse(judiciary_repl_dict_all>=1, 1, 0))
confusion_matrix_judiciary_repl_dict_all<-table(judiciary_repl_dict_all,const_overlap$judiciary.ck)
zero <- matrix(c(0,0),ncol=2,byrow=TRUE)
colnames(zero) <- c("0","1")
rownames(zero) <- c("0")
zero <- as.table(zero)
confusion_matrix_judiciary_repl_dict_all<-rbind(zero,confusion_matrix_judiciary_repl_dict_all) 
TP_judiciary_repl_dict_all <- confusion_matrix_judiciary_repl_dict_all[2,2]
TN_judiciary_repl_dict_all <- confusion_matrix_judiciary_repl_dict_all[1,1]
FP_judiciary_repl_dict_all <- confusion_matrix_judiciary_repl_dict_all[1,2]
FN_judiciary_repl_dict_all <- confusion_matrix_judiciary_repl_dict_all[2,1]
accuracy_judiciary_repl_dict_all <- (TP_judiciary_repl_dict_all + TN_judiciary_repl_dict_all) / (TP_judiciary_repl_dict_all + TN_judiciary_repl_dict_all + FP_judiciary_repl_dict_all + FN_judiciary_repl_dict_all)
round(accuracy_judiciary_repl_dict_all*100, 3)

# Footnote 10: Accordingly, when we account for all terms in our judicial independence dictionary we obtain lower levels of agreement as there are many more constitutions in our sample that include terms associated with an independent judiciary as op-posed to explicitly describing the judiciary as ‘independent’.
round(accuracy_independ_repl_dict_all*100, 3)
